## ---------------------------
##
## Script name: Replication files for: "Predictive Inference with Random Forests" by McAlexander & Mentch
##
## Author: Richard McAlexander & Lucas Mentch
##
## Date Created: 12/11/2019
##
## Email: richardmcalexander@gmail.com
##
## ---------------------------

#LOAD PACKAGES
library(rpart)
library(ibdreg)
library(MASS)
library(tidyverse)
library(haven)
library(surfin)
library(randomForest)

#SET SEED
set.seed(12345)

#LOAD DATA
fearonlaitin <- read_dta("fearonlaitin.dta",encoding='latin1') %>% as.data.frame()

#FORMAT FEARON & LAITIN DATA
fearonlaitin <- dplyr::select(fearonlaitin, onset, warl, gdpenl, lpop, lmtnest, ncontig, Oil, nwstate, instab, polity2l, ethfrac, relfrac)  %>% na.omit()
fearonlaitin <- filter(fearonlaitin,onset!=4)


#MAKE CODE FOR BAGGED.INF

# The function below will generate confidence intervals (return lower and upper bounds; based on the full feature set) for all points in the test set, as well as perform a test for significance for the specified features.
Bagged.Inf <- function(data,testvars,verbose=FALSE,k=75,nx1=50,nmc=1000,samplesize,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,nulltest=FALSE) {
  # train -- training set (data frame)
  # test -- test set (data frame)
  # testvars -- index of columns in the data frame corresponding to variables being tested
  # verbose -- if TRUE, outputs progress
  # k -- size of each subsample
  # nx1 -- variance estimation parameter; number of estimates of conditional expectation
  # nmc -- variance estimation parameter; number of monte carlo samples used to estimate conditional expectation
  # minsplit -- the minimum number of observations needed in a leaf to be eligible for splitting
  # maxcompete -- number of "back-up" splits to keep track of
  # maxsurrogate -- number of surrogate splits to consider
  # usesurrogate -- if nonzero, this helps deal with missing data
  
  # Defining rpart Control Parameters
  control.sim <- rpart.control(minsplit=minsplit,maxcompete=maxcompete,maxsurrogate=maxsurrogate,usesurrogate=usesurrogate)
  
  # Defining the size of the training set and ensemble
  
  m <- nx1*nmc
  
  # Defining the reduced data:
  #train.red <- train[,-testvars]
  #test.red <- test[,-testvars]
  
  
  if(nulltest == TRUE)
    #data$null <- data[,testvars][sample(1:nrow(data),nrow(data),replace=FALSE)]
    data$null = rnorm(nrow(data),1)
  
  sample_rows_bw <- sample(1:nrow(data),samplesize,replace=FALSE)
  test <- data[sample_rows_bw,]
  train <- data[-sample_rows_bw,]
  n <- dim(train)[1]
  train.red <- train
  #permute
  #train.red[,testvars] <- train[,testvars][sample(nrow(train),replace=FALSE)]
  
  perm <- sample(n)
  for (i in 1:dim(train.red)[2]) {
    if (i %in% testvars) train.red[,i] <- train.red[,i][perm]
  }
  
  
  # Build the trees and estimate the parameters:
  pred.all <- matrix(0,nrow=1,ncol=dim(test)[1])
  diff.all <- matrix(0,nrow=1,ncol=dim(test)[1])
  cond.exp.full <- matrix(0,nrow=nx1,ncol=dim(test)[1])
  cond.exp.diff <- matrix(0,nrow=nx1,ncol=dim(test)[1])
  for (i in 1:nx1) {
    ind.x1 <- sample(1:dim(train)[1],size=1,replace=FALSE)
    pred.full <- matrix(0,nrow=nmc,ncol=dim(test)[1])
    pred.red <- matrix(0,nrow=nmc,ncol=dim(test)[1])
    pred.diff <- matrix(0,nrow=nmc,ncol=dim(test)[1])
    for (j in 1:nmc) {
      ind <- c(ind.x1,sample((1:dim(train)[1])[-ind.x1],k-1,replace=FALSE))
      ss.full <- train[ind,]	
      ss.red <- train.red[ind,]
      tree.full <- rpart(onset~.,data=ss.full,control=control.sim)
      tree.red <- rpart(onset~.,data=ss.red,control=control.sim)
      pred.full[j,] <- predict(tree.full,test)
      pred.red[j,] <- predict(tree.red,test)
      pred.diff[j,] <- pred.full[j,] - pred.red[j,]
      if (verbose) cat("nx1:  ",i,"          nmc:  ",j,"\n")
    }
    pred.all <- rbind(pred.all,pred.full)
    diff.all <- rbind(diff.all,pred.diff)
    cond.exp.full[i,] <- apply(pred.full,2,mean)
    cond.exp.diff[i,] <- apply(pred.diff,2,mean)
  }
  pred.all <- pred.all[-1,]
  diff.all <- diff.all[-1,]
  
  mean.full <- apply(pred.all,2,mean)
  mean.diff <- apply(diff.all,2,mean)
  
  zeta1.full <- apply(cond.exp.full,2,var)
  zeta1.diff <- cov(cond.exp.diff)
  
  zetak.full <- apply(pred.all,2,var)
  zetak.diff <- cov(diff.all)
  
  sd.full <- sqrt((m/n)*((k^2)/m)*zeta1.full + (1/m)*zetak.full)
  lbounds <- qnorm(0.025,mean=mean.full,sd=sd.full)
  ubounds <- qnorm(0.975,mean=mean.full,sd=sd.full)
  
  tstat <- t(mean.diff) %*% ginv((m/n)*((k^2)/m)*zeta1.diff + (1/m)*zetak.diff) %*% mean.diff
  pval <- 1-pchisq(tstat,df=dim(test)[1])
  # lbounds -- lower bounds for the confidence intervals
  # ubounds -- upper bounds for the confidence intervals
  # pred -- predictions from the ensemble at the test points (center of the confidence intervals)
  # tstat -- test statistic from the test for significance
  # pval -- pvalue from the test for significance
  return(list("lbounds"=lbounds,"ubounds"=ubounds,"pred"=mean.full,"tstat"=tstat,"pval"=pval))
}


#SET SAMPLESIZE
samplesize = 20

#MAKE FUNCTION TO GET MEAN P-VALUE FROM SAMPLES
get_mean_p_value <- function(x,df){
  dim(x)[1]
  for(i in 1:length(seq(4,(dim(x)[1]*dim(x)[2]),by=5))){
    meanvect[i] <- x[[seq(4,(dim(x)[1]*dim(x)[2]),by=5)[i]]]
  }
  1-pchisq(mean(meanvect),df=df)
}


#THIS DOES THE SIGNIFICANCE TESTS FOR EACH OF THE VARIABLES OF INTEREST.
#IT WILL TAKE A WHILE
#ELF
fearonlaitin.p_value_elf <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(11),verbose=FALSE,k=75,nx1=50,nmc=800,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
fearonlaitin.p_value_elf
mean(unlist(fearonlaitin.p_value_elf[5,]))
1-pchisq(mean(unlist(fearonlaitin.p_value_elf[4,])),df=20)

#polity
fearonlaitin.p_value_polity <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(10),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_polity[4,])),df=20)

#relfrac
fearonlaitin.p_value_relfrac <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(12),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_relfrac[4,])),df=20)

#mountains
fearonlaitin.p_value_mountains <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(5),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_mountains[4,])),df=20)

#test for joint significance of elf and relfrac
fearonlaitin.p_value_relfrac_and_elf <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(11,12),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_relfrac_and_elf[4,])),df=20)

#newstate
fearonlaitin.p_value_newstate <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(8),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
get_mean_p_value(fearonlaitin.p_value_newstate,20) 
1-pchisq(mean(unlist(fearonlaitin.p_value_newstate[4,])),df=20) 

#warl
fearonlaitin.p_value_warl <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(2),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_warl[4,])),df=20)

#gdpl
fearonlaitin.p_value_gdpl <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(3),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_gdpl[4,])),df=20) 

#lpop
fearonlaitin.p_value_lpop <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(4),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_lpop[4,])),df=20)

#contig
fearonlaitin.p_value_contig <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(5),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_contig[4,])),df=20) 

#oil
fearonlaitin.p_value_oil <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(7),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_oil[4,])),df=20) 

#instab
fearonlaitin.p_value_instab <- replicate(100,Bagged.Inf(data=fearonlaitin,testvars=c(7),verbose=FALSE,k=75,nx1=50,nmc=1000,minsplit=3,maxcompete=0,maxsurrogate=0,usesurrogate=0,samplesize=20,nulltest=FALSE))
1-pchisq(mean(unlist(fearonlaitin.p_value_instab[4,])),df=20) 

#THESE P-VALUE ARE PUT INTO TABLE 1 MANUALLY.

##### make predicted probability plots

#make train/test data
y = fearonlaitin$onset
x = as.matrix(fearonlaitin[,-1])
n = nrow(fearonlaitin)

train = sample(1:n,n*0.7)
test = setdiff(1:n,train)
xtrain = x[train,]
ytrain = y[train]
xtest = x[test,]
ytest = y[test]

b_value <- 20

#fit the model
fit = forest(xtrain,ytrain,var.type="ustat",B=b_value,ntree=b_value*10000)


#make predictions and generate plots
#lmtnest
newdata1 <-  as.matrix(data.frame(lmtnest = seq(min(fearonlaitin$lmtnest),max(fearonlaitin$lmtnest),length.out=100),
                                  warl=mean(fearonlaitin$warl),
                                  gdpenl = mean(fearonlaitin$gdpenl),
                                  lpop = mean(fearonlaitin$lpop),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = mean(fearonlaitin$polity2l),
                                  ethfrac = mean(fearonlaitin$ethfrac),
                                  relfrac = mean(fearonlaitin$relfrac)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,1],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+1.96*y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-1.96*y_hat_var$var.hat
)


ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("Mountains") 

rm(newdata1,y_hat_var,var_df)
#gdpenl
newdata1 <-  as.matrix(data.frame(lmtnest = mean(fearonlaitin$gdpenl),
                                  warl=mean(fearonlaitin$warl),
                                  gdpenl = seq(min(fearonlaitin$gdpenl),max(fearonlaitin$gdpenl),length.out=100),
                                  lpop = mean(fearonlaitin$lpop),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = mean(fearonlaitin$polity2l),
                                  ethfrac = mean(fearonlaitin$ethfrac),
                                  relfrac = mean(fearonlaitin$relfrac)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,3],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-y_hat_var$var.hat
)

ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("GDP")

rm(newdata1,y_hat_var,var_df)
#lpop
newdata1 <-  as.matrix(data.frame(lmtnest = mean(fearonlaitin$gdpenl),
                                  warl = mean(fearonlaitin$warl),
                                  gdpenl = mean(fearonlaitin$gdpenl),
                                  lpop = seq(min(fearonlaitin$lpop),max(fearonlaitin$lpop),length.out=100),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = mean(fearonlaitin$polity2l),
                                  ethfrac = mean(fearonlaitin$ethfrac),
                                  relfrac = mean(fearonlaitin$relfrac)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,4],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-y_hat_var$var.hat
)

ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("Population (Log)")

rm(newdata1,y_hat_var,var_df)

#polity2l
newdata1 <-  as.matrix(data.frame(lmtnest = mean(fearonlaitin$gdpenl),
                                  warl = mean(fearonlaitin$warl),
                                  gdpenl = mean(fearonlaitin$gdpenl),
                                  lpop = mean(fearonlaitin$lpop),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = seq(min(fearonlaitin$polity2l),max(fearonlaitin$polity2l),length.out = 100),
                                  ethfrac = mean(fearonlaitin$ethfrac),
                                  relfrac = mean(fearonlaitin$relfrac)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,9],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-y_hat_var$var.hat
)

ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("Polity")

rm(newdata1,y_hat_var,var_df)

#ethfrac
newdata1 <-  as.matrix(data.frame(lmtnest = mean(fearonlaitin$gdpenl),
                                  warl = mean(fearonlaitin$warl),
                                  gdpenl = mean(fearonlaitin$gdpenl),
                                  lpop = mean(fearonlaitin$lpop),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = mean(fearonlaitin$polity2l),
                                  ethfrac = seq(min(fearonlaitin$ethfrac),max(fearonlaitin$ethfrac),length.out = 100),
                                  relfrac = mean(fearonlaitin$relfrac)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,10],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-y_hat_var$var.hat
)

ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("Ethnolinguistic Fractionalization")

rm(newdata1,y_hat_var,var_df)

#relfrac
newdata1 <-  as.matrix(data.frame(lmtnest = mean(fearonlaitin$gdpenl),
                                  warl = mean(fearonlaitin$warl),
                                  gdpenl = mean(fearonlaitin$gdpenl),
                                  lpop = mean(fearonlaitin$lpop),
                                  ncontig = mean(fearonlaitin$ncontig),
                                  Oil = mean(fearonlaitin$Oil),
                                  nwstate = mean(fearonlaitin$nwstate),
                                  instab = mean(fearonlaitin$instab),
                                  polity2l = mean(fearonlaitin$polity2l),
                                  ethfrac = mean(fearonlaitin$ethfrac),
                                  relfrac = seq(min(fearonlaitin$relfrac),max(fearonlaitin$relfrac),length.out = 100)))

y_hat <- predict(fit,newdata=newdata1,individualTrees=TRUE)
y_hat_var <- forest.varU(predictedAll = y_hat$predictedAll, object = fit)

var_df <- data.frame(
  x <- newdata1[,11],
  y <- y_hat_var$y.hat,
  ub <- y_hat_var$y.hat+y_hat_var$var.hat,
  lb <- y_hat_var$y.hat-y_hat_var$var.hat
)

ggplot(data = var_df, 
       aes(x,y)) +
  geom_smooth(se=FALSE)+
  #geom_line()+
  geom_smooth(data=var_df,aes(x,ub),se=FALSE,color="red",alpha=0.5)+
  geom_smooth(data=var_df,aes(x,lb),se=FALSE,color="red",alpha=0.5)+
  ylab("Civil Conflict Onset")+
  xlab("Religious Fractionalization")

rm(newdata1,y_hat_var,var_df)